Plots
Use Cases
When to Use Each Type
import plotly.express as px
import pandas as pd
from IPython.display import Image
from IPython.display import IFrame
object being counted / count or total
# Table 2.1
# Embed the basic-data-types table from clauswilke.com/dataviz in the notebook
IFrame(
    src='https://clauswilke.com/dataviz/aesthetic-mapping.html#tab:basic-data-types',
    width=1000,
    height=800,
)
# Load the tips sample dataset that ships with plotly express and display it
df = px.data.tips()
df
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
# counting qualitative data
px.bar(df,x='sex')
# counting qualitative data - horizontal bars (category on the y axis)
px.bar(df,y='sex')
# total tip amount by sex - note each slice represents the record value
px.bar(df,x='sex',y='tip')
# total tip amount by sex - horizontal bars (category on the y axis)
px.bar(df,y='sex',x='tip')
# This is incorrect. Plot a single continuous variable with a histogram
px.bar(df,y='tip')
# Use a histogram to look at the distribution of continuous data
px.histogram(df,x='tip')
# Use a scatter plot for two continuous variables:
# each record becomes one point at (total_bill, tip)
px.scatter(df, x='total_bill', y='tip')
'''
Practice Exercise: Use the tips dataset to make two plots.
1. Make a bar plot that counts the number of occurrences of a discrete variable.
2. Make a bar plot that breaks down a continuous variable by one discrete variable.
'''
df
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
# Answer here
If we have two or more sets of categories for which we want to show amounts, we can group or stack the bars.
df
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
# Map a second categorical variable to the color property.
# Bars are stacked by default.
px.bar(
    df,
    x='sex',
    color='day',
)
# Pass barmode='group' to get a grouped plot instead of a stacked one
px.bar(
    df,
    x='sex',
    color='day',
    barmode='group',
)
# Break down a continuous variable by 2 qualitative variables
px.bar(
    df,
    x='sex',
    y='tip',
    color='day',
    barmode='group',
)
'''
Practice Exercise: Use the tips dataset to make one plot.
1. Make a bar plot that breaks down total_bill by two discrete variables.
'''
df.head()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
# Answer here
# Show the reference figure (passengers by class and sex) hosted on clauswilke.com
Image(
    url='https://clauswilke.com/dataviz/visualizing_amounts_files/figure-html/titanic-passengers-by-class-sex-1.png',
    width=600,
)
df
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
# Count every (smoker, sex) combination; the result is a Series
df.loc[:, ['smoker', 'sex']].value_counts()
# Same counts, flattened into a dataframe with the categories as ordinary columns
(
    df.loc[:, ['smoker', 'sex']]
    .value_counts()
    .to_frame()
    .reset_index()
)
| smoker | sex | 0 | |
|---|---|---|---|
| 0 | No | Male | 97 |
| 1 | Yes | Male | 60 |
| 2 | No | Female | 54 |
| 3 | Yes | Female | 33 |
# Counts per (smoker, sex) pair as a dataframe whose value column is named 'count'
df_agg = (
    df[['smoker', 'sex']]
    .value_counts()
    .reset_index(name='count')
)
df_agg
| smoker | sex | count | |
|---|---|---|---|
| 0 | No | Male | 97 |
| 1 | Yes | Male | 60 |
| 2 | No | Female | 54 |
| 3 | Yes | Female | 33 |
# Plot the pre-aggregated counts: bar height is read from the 'count' column
px.bar(df_agg, x='sex', y='count', color='smoker')
# Plot from the raw records: no y is given, so px.bar counts the rows itself
px.bar(df, x='sex', color='smoker')
Scatter plots with one categorical axis (Y) and one continuous axis (X).
Image(url='https://drive.google.com/uc?id=1aNIYjsmxpWPZEYHdPMhckRh2fq6_auN1')
df
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
# Number of records for each day
df['day'].value_counts()
Sat 87 Sun 76 Thur 62 Fri 19 Name: day, dtype: int64
# Bar chart with 1 categorical and 1 continuous variable -
# each record's tip is stacked into the bar for its day
px.bar(df,x='day',y='tip')
# We can't just convert a bar to a scatter - it would just show each record
px.scatter(df,y='day',x='tip')
Pandas Group By
Group a dataframe by a categorical variable and compute some aggregate function on a continuous variable
# Group rows by sex, then apply a sum function to the tip, then combine into one structure
df.groupby(by='sex')['tip'].sum()
sex Female 246.51 Male 485.07 Name: tip, dtype: float64
# Can have multiple categories, compute a function across multiple continuous
# variables, and select one of many functions.
# Select the value columns with a list: selecting with a bare tuple of keys
# is deprecated and is rejected by newer pandas versions.
df.groupby(by=['day', 'sex'])[['tip', 'total_bill']].mean()
/var/folders/kh/plns67p91gv9v06gw7hpyxg80000gn/T/ipykernel_93291/1964103464.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
| tip | total_bill | ||
|---|---|---|---|
| day | sex | ||
| Fri | Female | 2.781111 | 14.145556 |
| Male | 2.693000 | 19.857000 | |
| Sat | Female | 2.801786 | 19.680357 |
| Male | 3.083898 | 20.802542 | |
| Sun | Female | 3.367222 | 19.872222 |
| Male | 3.220345 | 21.887241 | |
| Thur | Female | 2.575625 | 16.715312 |
| Male | 2.980333 | 18.714667 |
# Total tip for each (day, smoker) pair, with the group keys as ordinary columns
df_agg = (
    df.groupby(by=['day', 'smoker'])['tip']
    .sum()
    .reset_index()
)
df_agg
| day | smoker | tip | |
|---|---|---|---|
| 0 | Fri | No | 11.25 |
| 1 | Fri | Yes | 40.71 |
| 2 | Sat | No | 139.63 |
| 3 | Sat | Yes | 120.77 |
| 4 | Sun | No | 180.57 |
| 5 | Sun | Yes | 66.82 |
| 6 | Thur | No | 120.32 |
| 7 | Thur | Yes | 51.51 |
# Plot the aggregated values: one point per (day, smoker) row of df_agg
px.scatter(df_agg, x='tip', y='day', color='smoker')
'''
Practice Exercise: Use groupby to aggregate.
1. Make a bar plot that breaks down total_bill by two discrete variables.
'''
df.head()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
# Answer here
With px.imshow, each value of the data frame is represented as a heatmap pixel
# Load the stocks sample dataset (continuous values per ticker) and display it.
# Note: this rebinds df, replacing the tips data used above.
df = px.data.stocks()
df
| date | GOOG | AAPL | AMZN | FB | NFLX | MSFT | |
|---|---|---|---|---|---|---|---|
| 0 | 2018-01-01 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 1 | 2018-01-08 | 1.018172 | 1.011943 | 1.061881 | 0.959968 | 1.053526 | 1.015988 |
| 2 | 2018-01-15 | 1.032008 | 1.019771 | 1.053240 | 0.970243 | 1.049860 | 1.020524 |
| 3 | 2018-01-22 | 1.066783 | 0.980057 | 1.140676 | 1.016858 | 1.307681 | 1.066561 |
| 4 | 2018-01-29 | 1.008773 | 0.917143 | 1.163374 | 1.018357 | 1.273537 | 1.040708 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 100 | 2019-12-02 | 1.216280 | 1.546914 | 1.425061 | 1.075997 | 1.463641 | 1.720717 |
| 101 | 2019-12-09 | 1.222821 | 1.572286 | 1.432660 | 1.038855 | 1.421496 | 1.752239 |
| 102 | 2019-12-16 | 1.224418 | 1.596800 | 1.453455 | 1.104094 | 1.604362 | 1.784896 |
| 103 | 2019-12-23 | 1.226504 | 1.656000 | 1.521226 | 1.113728 | 1.567170 | 1.802472 |
| 104 | 2019-12-30 | 1.213014 | 1.678000 | 1.503360 | 1.098475 | 1.540883 | 1.788185 |
105 rows × 7 columns
# Heatmap of the raw values - each dataframe value is drawn as one colored cell
# NOTE(review): df still contains the non-numeric 'date' column at this point -
# confirm px.imshow handles it as intended
px.imshow(df,width=700,height=700)
# Use pandas correlation function on the numeric columns only.
# The 'date' column holds strings; newer pandas no longer drops non-numeric
# columns silently in corr(), so select the numeric ones explicitly.
df.select_dtypes(include='number').corr()
| GOOG | AAPL | AMZN | FB | NFLX | MSFT | |
|---|---|---|---|---|---|---|
| GOOG | 1.000000 | 0.833629 | 0.556702 | 0.633169 | 0.140254 | 0.747029 |
| AAPL | 0.833629 | 1.000000 | 0.560877 | 0.493498 | 0.049519 | 0.786771 |
| AMZN | 0.556702 | 0.560877 | 1.000000 | 0.341430 | 0.619946 | 0.660896 |
| FB | 0.633169 | 0.493498 | 0.341430 | 1.000000 | 0.265663 | 0.472227 |
| NFLX | 0.140254 | 0.049519 | 0.619946 | 0.265663 | 1.000000 | 0.079532 |
| MSFT | 0.747029 | 0.786771 | 0.660896 | 0.472227 | 0.079532 | 1.000000 |
# Matrix of values: heatmap of the pairwise correlations between the tickers.
# Restrict to numeric columns so the string 'date' column cannot break corr().
px.imshow(df.select_dtypes(include='number').corr())
import numpy as np
# 9 x 16 grid of uniform random values in [0, 1)
colors = np.random.random(size=(9, 16))
colors
array([[0.00661908, 0.38889081, 0.82299527, 0.29400284, 0.51999436,
0.92975834, 0.16530968, 0.87993949, 0.62508173, 0.68918127,
0.75492581, 0.44298281, 0.72560364, 0.71857954, 0.42530702,
0.38166019],
[0.55843956, 0.09340349, 0.06804757, 0.56337219, 0.17338547,
0.15427628, 0.13325399, 0.72909297, 0.76820082, 0.97933499,
0.95486782, 0.29918547, 0.92712032, 0.24453814, 0.15759402,
0.94983585],
[0.46073902, 0.47604746, 0.26661154, 0.4310107 , 0.8657953 ,
0.90104762, 0.43565712, 0.02279515, 0.27638027, 0.46970307,
0.15344466, 0.78965919, 0.19433078, 0.30735017, 0.82680199,
0.60376475],
[0.6945631 , 0.8436158 , 0.35036677, 0.36266975, 0.446323 ,
0.01392668, 0.01973868, 0.64562098, 0.8615941 , 0.03255129,
0.74675502, 0.16225547, 0.0091061 , 0.70497683, 0.11507202,
0.23814179],
[0.52843522, 0.40524668, 0.05614435, 0.26303703, 0.31930379,
0.83882764, 0.30817174, 0.4568098 , 0.9796798 , 0.68190925,
0.5803533 , 0.91400869, 0.18450815, 0.1643243 , 0.09157274,
0.14142348],
[0.11454467, 0.22625487, 0.34165394, 0.02946669, 0.36934826,
0.19494686, 0.46765759, 0.20477195, 0.49988452, 0.93330036,
0.9748635 , 0.94530959, 0.00620233, 0.37153224, 0.83672787,
0.63673004],
[0.96753576, 0.96360787, 0.65414457, 0.79004722, 0.80943304,
0.61042928, 0.64677509, 0.55435694, 0.20431713, 0.09773841,
0.3581671 , 0.5596828 , 0.71933434, 0.36892599, 0.65870409,
0.03320889],
[0.21118429, 0.31646121, 0.99402624, 0.46530278, 0.46162795,
0.18405642, 0.76823483, 0.69429339, 0.13512095, 0.81381762,
0.37039597, 0.10622007, 0.07123317, 0.35011568, 0.43613387,
0.17248956],
[0.27848561, 0.67316211, 0.92892431, 0.19603397, 0.05025232,
0.43824633, 0.92738859, 0.97291609, 0.92169445, 0.45274421,
0.49993208, 0.61337845, 0.38342769, 0.17913624, 0.76408922,
0.0781835 ]])
# Draw the random array as a heatmap - each array value becomes one pixel
px.imshow(colors)